In [5]:
# import libraries
from datetime import datetime, timedelta
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from __future__ import division
import chart_studio.plotly as py
#import plotly.plotly as py
import plotly.offline as pyoff
import plotly.graph_objs as go
from sklearn.cluster import KMeans
#initiate visualization library for jupyter notebook 
pyoff.init_notebook_mode()
import os
from sklearn.cluster import KMeans
from sklearn.feature_selection import VarianceThreshold
 
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
In [6]:
# define the derictories where all dump 
os.chdir(r'C:\Users\mainak.kundu\Desktop\REVOLT')
# read the data 
rev_user = pd.read_csv('rev-users.csv')
rev_trns = pd.read_csv('rev-transactions.csv')
rev_dvc = pd.read_csv('rev-devices.csv')
rev_notfc = pd.read_csv('rev-notifications.csv')
# shape the data
print('user_data:',rev_user.shape)
print('transaction_data:',rev_trns.shape)
print('device_data:',rev_dvc.shape)
print('notification_data:',rev_notfc.shape)
('user_data:', (19430, 12))
('transaction_data:', (2740075, 12))
('device_data:', (19430, 2))
('notification_data:', (121813, 5))
In [7]:
#converting the type of Invoice Date Field from string to datetime.
rev_notfc.rename(columns = {'created_date':'notification_date'}, inplace = True) ## change the column name of created_date 
rev_trns.rename(columns = {'created_date':'transaction_date'}, inplace = True) ## change the column name of created_date
rev_trns['transaction_date'] = pd.to_datetime(rev_trns['transaction_date'])

#creating YearMonth field for the ease of reporting and visualization
rev_trns['yr_mnth'] = rev_trns['transaction_date'].map(lambda date: 100*date.year + date.month)

EDA- Avg amounts month by month

In [9]:
from matplotlib import rcParams
# figure size in inches
rcParams['figure.figsize'] = 11.7,8.27
tx_revenue = rev_trns.groupby(['yr_mnth'])['amount_usd'].mean().reset_index()
sns.barplot(tx_revenue['yr_mnth'],tx_revenue['amount_usd'])
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x2145d240>
In [10]:
tx_revenue.head(2)
Out[10]:
yr_mnth amount_usd
0 201801 144.936517
1 201802 116.127118
In [11]:
# Monthly active user 
#creating monthly active customers dataframe by counting unique Customer IDs
tx_monthly_active = rev_trns.groupby('yr_mnth')['user_id'].nunique().reset_index()

#print the dataframe
tx_monthly_active

#plotting the output
plot_data = [
    go.Bar(
        x=tx_monthly_active['yr_mnth'],
        y=tx_monthly_active['user_id'],
    )
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='Monthly Active Customers'
    )

fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [12]:
#create a dataframe contaning CustomerID and first purchase date
tx_min_purchase = rev_trns.groupby('user_id').transaction_date.min().reset_index()
tx_min_purchase.columns = ['user_id','min_purchase_date']
tx_min_purchase['min_purchase_date'] = tx_min_purchase['min_purchase_date'].map(lambda date: 100*date.year + date.month)
rev_trns = pd.merge(rev_trns, tx_min_purchase, on='user_id')
rev_trns.head()
Out[12]:
transaction_id transactions_type transactions_currency amount_usd transactions_state ea_cardholderpresence ea_merchant_mcc ea_merchant_city ea_merchant_country direction user_id transaction_date yr_mnth min_purchase_date
0 transaction_0 TRANSFER AED 4.55 COMPLETED NaN NaN NaN NaN OUTBOUND user_898 2018-04-03 03:34:21.784487 201804 201801
1 transaction_378 TRANSFER AED 14.47 COMPLETED NaN NaN NaN NaN OUTBOUND user_898 2018-04-08 04:43:53.933487 201804 201801
2 transaction_379 ATM AED 41.33 COMPLETED FALSE 6011.0 Abu Dhabi ARE OUTBOUND user_898 2018-04-07 18:41:07.029487 201804 201801
3 transaction_380 CARD_PAYMENT AED 6.61 COMPLETED FALSE 5814.0 Dubai ARE OUTBOUND user_898 2018-04-04 23:33:57.894487 201804 201801
4 transaction_381 TRANSFER AED 0.21 COMPLETED NaN NaN NaN NaN OUTBOUND user_898 2018-04-04 19:39:31.674487 201804 201801
In [14]:
#create a column called User Type and assign Existing 
#if User's First Purchase Year Month before the selected Invoice Year Month
rev_trns['user_type'] = 'New'
rev_trns.loc[rev_trns['yr_mnth']>rev_trns['min_purchase_date'],'user_type'] = 'Existing'
In [15]:
#calculate the Revenue per month for each user type
tx_user_type_revenue = rev_trns.groupby(['yr_mnth','user_type'])['amount_usd'].mean().reset_index()
tx_user_type_revenue.head(2)
Out[15]:
yr_mnth user_type amount_usd
0 201801 New 144.936517
1 201802 Existing 142.147482
In [16]:
sns.pointplot(tx_user_type_revenue['yr_mnth'],tx_user_type_revenue['amount_usd'],hue=tx_user_type_revenue['user_type'])
Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x21e8f2b0>
In [18]:
#create a dataframe that shows new user ratio - we also need to drop NA values (first month new user ratio is 0)
tx_user_ratio = rev_trns.query("user_type == 'New'").groupby(['yr_mnth'])['user_id'].nunique()/rev_trns.query("user_type == 'Existing'").groupby(['yr_mnth'])['user_id'].nunique() 
tx_user_ratio = tx_user_ratio.reset_index()
tx_user_ratio = tx_user_ratio.dropna()
#figure(num=None, figsize=(15,15), dpi=48, facecolor='w', edgecolor='k')
sns.barplot(tx_user_ratio['yr_mnth'],tx_user_ratio['user_id'])
Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x38b782e8>
In [19]:
#identify which users are active by looking at their revenue per month
tx_user_purchase = rev_trns.groupby(['user_id','yr_mnth'])['amount_usd'].sum().reset_index()

#create retention matrix with crosstab
tx_retention = pd.crosstab(tx_user_purchase['user_id'], tx_user_purchase['yr_mnth']).reset_index()

tx_retention.head()

#create an array of dictionary which keeps Retained & Total User count for each month
months = tx_retention.columns[2:]
retention_array = []
for i in range(len(months)-1):
    retention_data = {}
    selected_month = months[i+1]
    prev_month = months[i]
    retention_data['yr_mnth'] = int(selected_month)
    retention_data['TotalUserCount'] = tx_retention[selected_month].sum()
    retention_data['RetainedUserCount'] = tx_retention[(tx_retention[selected_month]>0) & (tx_retention[prev_month]>0)][selected_month].sum()
    retention_array.append(retention_data)
    
#convert the array to dataframe and calculate Retention Rate
tx_retention = pd.DataFrame(retention_array)
tx_retention['RetentionRate'] = tx_retention['RetainedUserCount']/tx_retention['TotalUserCount']

#plot the retention rate graph
plot_data = [
    go.Scatter(
        x=tx_retention['yr_mnth'],
        y=tx_retention['RetentionRate'],
        name="organic"
    )
    
]

plot_layout = go.Layout(
        xaxis={"type": "category"},
        title='Monthly Retention Rate'
    )
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
In [20]:
tx_user_purchase = rev_trns.groupby(['user_id','yr_mnth'])['amount_usd'].sum().astype(int).reset_index()
tx_retention = pd.crosstab(tx_user_purchase['user_id'],tx_user_purchase['yr_mnth']).reset_index()
tx_retention.head()
Out[20]:
yr_mnth user_id 201801 201802 201803 201804 201805 201806 201807 201808 201809 201810 201811 201812 201901 201902 201903 201904 201905
0 user_0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
1 user_1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 0 0 0
2 user_10 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3 user_100 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 1
4 user_1000 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0

RFM -EDA

In [397]:
#create a generic user dataframe to keep CustomerID and new segmentation scores
tx_user = pd.DataFrame(rev_trns['user_id'].unique())
tx_user.columns = ['user_id']

#get the max purchase date for each customer and create a dataframe with it
tx_max_purchase = rev_trns.groupby('user_id').transaction_date.max().reset_index()
tx_max_purchase.columns = ['user_id','max_purchase_date']

#we take our observation point as the max invoice date in our dataset
tx_max_purchase['Recency'] = (tx_max_purchase['max_purchase_date'].max() - tx_max_purchase['max_purchase_date']).dt.days

#merge this dataframe to our new user dataframe
tx_user = pd.merge(tx_user, tx_max_purchase[['user_id','Recency']], on='user_id')

tx_user.head()
Out[397]:
user_id Recency
0 user_898 15
1 user_1652 6
2 user_5509 6
3 user_5496 13
4 user_4896 10
In [398]:
tx_user['Recency'].describe()
Out[398]:
count    18766.000000
mean        69.865768
std        101.067781
min          0.000000
25%          6.000000
50%         18.000000
75%         92.000000
max        498.000000
Name: Recency, dtype: float64

Apply k-means clustering on Recency

In [399]:
from sklearn.cluster import KMeans

sse={}
tx_recency = tx_user[['Recency']]
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, max_iter=1000).fit(tx_recency)
    tx_recency["clusters"] = kmeans.labels_
    sse[k] = kmeans.inertia_ 
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.show() ## 2 or 3(max) clusters are enough 
In [400]:
#build 4 clusters for recency and add it to dataframe
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])

#function for ordering cluster numbers
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final

tx_user = order_cluster('RecencyCluster', 'Recency',tx_user,False)
#see details of each cluster
tx_user.groupby('RecencyCluster')['Recency'].describe()
Out[400]:
count mean std min 25% 50% 75% max
RecencyCluster
0 1617.0 333.107607 64.025073 247.0 278.0 323.0 375.0 498.0
1 3138.0 159.949649 43.975353 90.0 123.0 156.0 196.0 246.0
2 14011.0 19.309400 20.098607 0.0 5.0 11.0 24.0 89.0

Frequency

In [401]:
#get order counts for each user and create a dataframe with it
tx_frequency = rev_trns.groupby('user_id').transaction_date.count().reset_index()
tx_frequency.columns = ['user_id','Frequency']

#add this data to our main dataframe
tx_user = pd.merge(tx_user, tx_frequency, on='user_id')
In [402]:
#k-means
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])

#order the frequency cluster
tx_user = order_cluster('FrequencyCluster', 'Frequency',tx_user,True)

#see details of each cluster
tx_user.groupby('FrequencyCluster')['Frequency'].describe()
Out[402]:
count mean std min 25% 50% 75% max
FrequencyCluster
0 15674.0 64.962996 63.612454 1.0 13.0 43.0 99.00 251.0
1 2676.0 438.843049 158.166826 252.0 308.0 393.0 533.00 876.0
2 416.0 1316.108173 494.551846 880.0 989.5 1175.5 1441.75 5285.0

Revenue

In [403]:
#calculate revenue for each customer
tx_revenue = rev_trns.groupby('user_id').amount_usd.sum().reset_index()

#merge it with our main dataframe
tx_user = pd.merge(tx_user, tx_revenue, on='user_id')
In [404]:
tx_user.head()
Out[404]:
user_id Recency RecencyCluster Frequency FrequencyCluster amount_usd
0 user_898 15 2 261 1 6507.06
1 user_1652 6 2 588 1 130209.34
2 user_4896 10 2 268 1 5344.39
3 user_2571 6 2 460 1 40203.17
4 user_1293 10 2 330 1 64000.82
In [405]:
#apply clustering
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['amount_usd']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['amount_usd']])


#order the cluster numbers
tx_user = order_cluster('RevenueCluster', 'amount_usd',tx_user,True)

#show details of the dataframe
tx_user.groupby('RevenueCluster')['amount_usd'].describe()
Out[405]:
count mean std min 25% 50% 75% max
RevenueCluster
0 18760.0 4.705152e+06 1.935126e+08 0.000000e+00 4.695825e+02 2.543275e+03 8.197287e+03 1.532104e+10
1 3.0 3.895038e+10 1.094085e+10 2.978726e+10 3.289363e+10 3.600001e+10 4.353194e+10 5.106388e+10
2 3.0 8.719220e+10 5.677886e+09 8.285213e+10 8.397932e+10 8.510651e+10 8.936224e+10 9.361796e+10
In [407]:
#calculate overall score and use mean() to see details
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
tx_user.groupby('OverallScore')['Recency','Frequency','amount_usd'].mean()
Out[407]:
Recency Frequency amount_usd
OverallScore
0 333.306832 15.078882 1.252372e+03
1 160.473413 27.527876 2.532991e+03
2 22.767742 84.190550 3.781322e+06
3 8.940662 439.726132 4.011826e+07
4 5.887560 1308.942584 5.406320e+08
5 3.000000 684.000000 9.361796e+10

Inference - 0 is the wrost and 5 is the best

In [408]:
tx_user['segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore']>=1,'segment'] = 'Mid-Value' 
tx_user.loc[tx_user['OverallScore']>3,'segment'] = 'High-Value'
In [420]:
tx_user.groupby('segment')['Recency','Frequency','amount_usd'].mean()
Out[420]:
Recency Frequency amount_usd
segment
High-Value 5.880668 1307.451074 7.627736e+08
Low-Value 333.306832 15.078882 1.252372e+03
Mid-Value 46.126128 129.531935 8.788446e+06
In [417]:
sns.scatterplot(tx_user['Recency'],tx_user['Frequency'],hue=tx_user['segment'])
Out[417]:
<matplotlib.axes._subplots.AxesSubplot at 0x20710a0dd30>
In [419]:
sns.scatterplot(tx_user['Frequency'],tx_user['amount_usd'],hue=tx_user['segment'])
Out[419]:
<matplotlib.axes._subplots.AxesSubplot at 0x2070d73f5c0>

Data Prepration

In [247]:
import xgboost as xgb
from sklearn.model_selection import KFold, cross_val_score, train_test_split
In [248]:
rev_trns['transaction_date'].max(),rev_trns['transaction_date'].min()
Out[248]:
(Timestamp('2019-05-16 18:22:16.959896'),
 Timestamp('2018-01-01 08:51:10.248709'))

Create 2 dataframe one is 12months and another is 5months, we will predict revenue on 5th Months with 12months

In [249]:
#create 12m and 5m dataframes
# Put a filter and segregate last 2 months data who are transacted 
start_date = '2018-01-01'
end_date = '2018-12-30'
mask = (rev_trns['transaction_date'] > start_date) & (rev_trns['transaction_date'] <= end_date) 
tx_12m = rev_trns.loc[mask]
tx_12m.shape
Out[249]:
(1384853, 13)
In [250]:
start_date = '2019-01-01'
end_date = '2019-05-16'
mask = (rev_trns['transaction_date'] > start_date) & (rev_trns['transaction_date'] <= end_date) 
tx_5m = rev_trns.loc[mask]
tx_5m.shape
Out[250]:
(1337714, 13)
In [251]:
#create tx_user for assigning clustering
tx_user = pd.DataFrame(tx_12m['user_id'].unique())
tx_user.columns = ['user_id']
In [252]:
#order cluster method
def order_cluster(cluster_field_name, target_field_name,df,ascending):
    new_cluster_field_name = 'new_' + cluster_field_name
    df_new = df.groupby(cluster_field_name)[target_field_name].mean().reset_index()
    df_new = df_new.sort_values(by=target_field_name,ascending=ascending).reset_index(drop=True)
    df_new['index'] = df_new.index
    df_final = pd.merge(df,df_new[[cluster_field_name,'index']], on=cluster_field_name)
    df_final = df_final.drop([cluster_field_name],axis=1)
    df_final = df_final.rename(columns={"index":cluster_field_name})
    return df_final
In [253]:
tx_12m.head(2)
Out[253]:
transaction_id transactions_type transactions_currency amount_usd transactions_state ea_cardholderpresence ea_merchant_mcc ea_merchant_city ea_merchant_country direction user_id transaction_date yr_mnth
0 transaction_0 TRANSFER AED 4.55 COMPLETED NaN NaN NaN NaN OUTBOUND user_898 2018-04-03 03:34:21.784487 201804
12 transaction_12 TRANSFER AED 61.99 COMPLETED NaN NaN NaN NaN OUTBOUND user_5496 2018-09-07 03:41:24.499256 201809
In [254]:
#calculate recency score
tx_max_purchase = tx_12m.groupby('user_id').transaction_date.max().reset_index()
tx_max_purchase.columns = ['user_id','MaxPurchaseDate']
tx_max_purchase['Recency'] = (tx_max_purchase['MaxPurchaseDate'].max() - tx_max_purchase['MaxPurchaseDate']).dt.days
tx_user = pd.merge(tx_user, tx_max_purchase[['user_id','Recency']], on='user_id')
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Recency']])
tx_user['RecencyCluster'] = kmeans.predict(tx_user[['Recency']])
tx_user = order_cluster('RecencyCluster', 'Recency',tx_user,False)
tx_user.head(2)
Out[254]:
user_id Recency RecencyCluster
0 user_898 6 2
1 user_5496 0 2
In [255]:
#calcuate frequency score
tx_frequency = tx_12m.groupby('user_id').transaction_date.count().reset_index()
tx_frequency.columns = ['user_id','Frequency']
tx_user = pd.merge(tx_user, tx_frequency, on='user_id')
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Frequency']])
tx_user['FrequencyCluster'] = kmeans.predict(tx_user[['Frequency']])
tx_user = order_cluster('FrequencyCluster', 'Frequency',tx_user,True)
tx_user.head(2)
Out[255]:
user_id Recency RecencyCluster Frequency FrequencyCluster
0 user_898 6 2 242 1
1 user_4896 3 2 153 1
In [256]:
#calcuate revenue score
tx_12m['Revenue'] = tx_12m['amount_usd']
tx_revenue = tx_12m.groupby('user_id').Revenue.sum().reset_index()
tx_user = pd.merge(tx_user, tx_revenue, on='user_id')
kmeans = KMeans(n_clusters=3)
kmeans.fit(tx_user[['Revenue']])
tx_user['RevenueCluster'] = kmeans.predict(tx_user[['Revenue']])
tx_user = order_cluster('RevenueCluster', 'Revenue',tx_user,True)
tx_user.head(2)
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Out[256]:
user_id Recency RecencyCluster Frequency FrequencyCluster Revenue RevenueCluster
0 user_898 6 2 242 1 6274.41 0
1 user_4896 3 2 153 1 4066.90 0
In [257]:
#overall scoring
tx_user['OverallScore'] = tx_user['RecencyCluster'] + tx_user['FrequencyCluster'] + tx_user['RevenueCluster']
tx_user['segment'] = 'Low-Value'
tx_user.loc[tx_user['OverallScore']>3,'segment'] = 'High-Value'
In [258]:
tx_user.segment.value_counts()
Out[258]:
Low-Value     17464
High-Value      361
Name: segment, dtype: int64
In [259]:
(359/17466)*100
Out[259]:
2.055421962670331

Test data prep

In [260]:
#calculate revenue and create a new dataframe for it
tx_5m['Revenue'] = tx_5m['amount_usd']
tx_user_5m = tx_5m.groupby('user_id')['Revenue'].sum().reset_index()
tx_user_5m.columns = ['user_id','m5_Revenue']
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Merge our 12months and 5months into a single dataframe

In [261]:
tx_user_5m.head()
Out[261]:
user_id m5_Revenue
0 user_0 5743.62
1 user_1 1050.87
2 user_10 12577.26
3 user_100 6495.30
4 user_10000 7034.78
In [262]:
tx_merge = pd.merge(tx_user, tx_user_5m, on='user_id', how='left')
tx_merge = tx_merge.fillna(0)
tx_merge.head()
Out[262]:
user_id Recency RecencyCluster Frequency FrequencyCluster Revenue RevenueCluster OverallScore segment m5_Revenue
0 user_898 6 2 242 1 6274.41 0 3 Low-Value 232.65
1 user_4896 3 2 153 1 4066.90 0 3 Low-Value 1277.49
2 user_2571 0 2 299 1 14192.10 0 3 Low-Value 25876.60
3 user_3705 0 2 191 1 23439.04 0 3 Low-Value 15738.90
4 user_1293 5 2 207 1 58078.51 0 3 Low-Value 5922.31

Considering business part of this analysis, we need to treat customers differently based on their predicted LTV. For this example, we will apply clustering and have 2 segments (number of segments really depends on your business dynamics and goals)

In [263]:
#remove outliers
tx_merge = tx_merge[tx_merge['m5_Revenue']<tx_merge['m5_Revenue'].quantile(0.99)]


#creating 3 clusters
kmeans = KMeans(n_clusters=2)
kmeans.fit(tx_merge[['m5_Revenue']])
tx_merge['LTVCluster'] = kmeans.predict(tx_merge[['m5_Revenue']])

#order cluster number based on LTV
tx_merge = order_cluster('LTVCluster', 'm5_Revenue',tx_merge,True)

#creatinga new cluster dataframe
tx_cluster = tx_merge.copy()

#see details of the clusters
tx_cluster.groupby('LTVCluster')['m5_Revenue'].describe()
Out[263]:
count mean std min 25% 50% 75% max
LTVCluster
0 16665.0 1982.616263 3008.571123 0.00 8.51 606.65 2596.55 14854.54
1 981.0 27749.624393 13510.796309 14876.93 17948.93 22671.26 33156.48 75232.85
In [264]:
tx_cluster['LTVCluster'].value_counts()
Out[264]:
0    16665
1      981
Name: LTVCluster, dtype: int64
In [265]:
(981/17646)*100  ### distribution is highly skewed 
Out[265]:
5.61600362688428

LTVCluster == 1 is Engaged set (This cohurt is cash-cow)

LTVCluster == 0 is Unengaged set(Not generating much revenue)

Feature Engineering (we used 12 months of behaviour of user for predicting next 5 months Rev)

In [266]:
def trans_after_notified(rev_notfc,rev_trns):
    '''
    This function will take the transaction & notifcation merge data and tagged the customers 
    who are responded after 7 days of notification sent  
    '''
    df = rev_notfc.merge(rev_trns,how='left',on='user_id') ## notif + transac
    
    ## convert it to date format 
    df['notification_date'] =  pd.to_datetime(df['notification_date'], infer_datetime_format=True)
    df['transaction_date'] =  pd.to_datetime(df['transaction_date'], infer_datetime_format=True)
    
    ## date difference between notification sent and transaction happened and lastly into days 
    df['diff_trans_from_notified'] = df['notification_date'] - df['transaction_date']
    df['diff_trans_from_notified'] = df['diff_trans_from_notified'] / np.timedelta64(1, 'D')
    
    ## create column with after notification transaction happened in 7 days 
    df["after_notification_trans_in_7_days"]=np.where((-1 > df["diff_trans_from_notified"])&(df["diff_trans_from_notified"]>=-7),1,0)
    
    ## do groupby and create a categorical column 
    df_grp = df.groupby('user_id')['after_notification_trans_in_7_days'].sum().reset_index()
    print(df_grp.head(2))
    df_grp['is_trans'] = np.where(df_grp['after_notification_trans_in_7_days']>0,1,0)
    df_grp = df_grp[['user_id','after_notification_trans_in_7_days']] ## final data 
    return df_grp




def dummy_grp_features(rev_trns,rev_dvc,rev_notfc,rev_user):
    '''
    This function took all the data and do grouping on transaction data,
    and create a dummy columns and bind back to unique transaction data
    and join with other sources of data like users,device,notification 
    '''
    rev_trns_1 = pd.get_dummies(rev_trns['transactions_state'])
    rev_trns_2 = pd.get_dummies(rev_trns['direction'])
    rev_trns_grp = pd.concat([rev_trns.reset_index(drop=True), rev_trns_1,rev_trns_2], axis=1)
    rev_trns_grp1 = rev_trns.groupby('user_id')['transaction_id'].count().reset_index()
    rev_trns_grp1.rename(columns={'transaction_id': 'count_of transaction_id'},inplace=True)
    rev_trns_grp = rev_trns_grp.groupby('user_id')[['CANCELLED','COMPLETED','DECLINED','FAILED','PENDING','REVERTED']].sum().reset_index()
    rev_trns_grp = rev_trns_grp1.merge(rev_trns_grp,how='left',on='user_id')  ## join transaction data
    print('--- Grouping on Transaction data complete----')
    
    rev_notfc_1 = pd.get_dummies(rev_notfc['reason'])
    rev_notfc_2 = pd.get_dummies(rev_notfc['channel'])
    rev_notfc_3 = pd.get_dummies(rev_notfc['status'])
    rev_notfc_grp = pd.concat([rev_notfc.reset_index(drop=True), rev_notfc_1,rev_notfc_2,rev_notfc_3], axis=1)
    rev_notfc_grp1 = rev_notfc.groupby('user_id')['notification_date'].count().reset_index() ## count of notifivation sent to users
    rev_notfc_grp1.rename(columns={'notification_date': 'count_of notification_sent'},inplace=True)
    rev_notfc_grp = rev_notfc_grp.groupby('user_id')[['BLACK_FRIDAY','ENGAGEMENT_SPLIT_BILL_RESTAURANT',
                                           'FIFTH_PAYMENT_PROMO','JOINING_ANNIVERSARY','LOST_CARD_ORDER',
                                           'MADE_MONEY_REQUEST_NOT_SPLIT_BILL','METAL_GAME_START','METAL_RESERVE_PLAN',
                                           'NO_INITIAL_CARD_ORDER','NO_INITIAL_CARD_USE','NO_INITIAL_FREE_PROMOPAGE_CARD_ORDER',
                                           'ONBOARDING_TIPS_ACTIVATED_USERS','PREMIUM_ENGAGEMENT_FEES_SAVED','PREMIUM_ENGAGEMENT_INACTIVE_CARD',
                                           'PUMPKIN_PAYMENT_NOTIFICATION','REENGAGEMENT_ACTIVE_FUNDS','WELCOME_HOME','EMAIL','PUSH','SMS','FAILED','SENT']].sum().reset_index()
    rev_notfc_grp = rev_notfc_grp1.merge(rev_notfc_grp,how='left',on='user_id')  ## join notification data                                        
                                          
    
    r1 = rev_user.merge(rev_dvc,how='left',on='user_id')
    r2 = r1.merge(rev_notfc_grp,how='left',on='user_id')
    r3 = r2.merge(rev_trns_grp,how='left',on='user_id') 
    print('--- Merging Done, data prep done ---')
    return r3

def feature_engineering_pipeline(rev_notfc_tr,rev_trns_tr,rev_user,rev_dvc):
    '''
    Feature pipeline having all the function call for creating new features
    '''
    df1 = trans_after_notified(rev_notfc,rev_trns) ## within 7 days feature create  
    df_grp = dummy_grp_features(rev_trns_tr,rev_dvc,rev_notfc_tr,rev_user) ## all group by things will happen
    df_fnl = df_grp.merge(df1,how='left',on='user_id') ## all features concatenated 
    return df_fnl
In [267]:
start_date = '2018-01-01'
end_date = '2018-12-30'
mask = (rev_trns['transaction_date'] > start_date) & (rev_trns['transaction_date'] <= end_date) 
trns_12 = rev_trns.loc[mask] ## this data is the time data 
notfc_12 = rev_notfc.loc[mask] ## this data is the time data 
print('12 months of transaction & Notification data:',trns_12.shape,notfc_12.shape)
12 months of transaction & Notification data: (1384853, 13) (95314, 5)
In [268]:
tx_feature_df = feature_engineering_pipeline(notfc_12,trns_12,rev_user,rev_dvc) ## grp features done 

# since how many days user is with app 
max_date = rev_trns['transaction_date'].max()
tx_feature_df['created_date'] = pd.to_datetime(tx_feature_df['created_date'])
tx_feature_df['days_since_app'] = max_date - tx_feature_df['created_date']  
tx_feature_df['days_since_app'] = tx_feature_df['days_since_app'] / np.timedelta64(1, 'D')

## person's age who is using the app 
tx_feature_df['age']=2020 -tx_feature_df['birth_year']
tx_feature_df.head(2)
  user_id  after_notification_trans_in_7_days
0  user_0                                  28
1  user_1                                  32
--- Grouping on Transaction data complete----
--- Merging Done, data prep done ---
Out[268]:
user_id birth_year country city created_date user_settings_crypto_unlocked plan attributes_notifications_marketing_push attributes_notifications_marketing_email num_contacts ... count_of transaction_id CANCELLED COMPLETED DECLINED FAILED_y PENDING REVERTED after_notification_trans_in_7_days days_since_app age
0 user_0 1989 PL Gdansk 2018-01-13 05:15:15.599466 1 STANDARD 1.0 1.0 3 ... 254.0 0.0 144.0 15.0 3.0 0.0 2.0 28.0 488.546544 31
1 user_1 1975 GB London 2018-01-29 03:38:46.676876 0 STANDARD NaN NaN 21 ... 113.0 1.0 67.0 5.0 3.0 0.0 8.0 32.0 472.613545 45

2 rows × 46 columns

In [269]:
sns.distplot(tx_feature_df['age']) ## Normally distributed
Out[269]:
<matplotlib.axes._subplots.AxesSubplot at 0x20701ecc5f8>
In [270]:
sns.distplot(tx_feature_df['days_since_app']) ## majorly old users 
Out[270]:
<matplotlib.axes._subplots.AxesSubplot at 0x20702d4bb38>
In [271]:
print(tx_cluster.head(2)) ## rfm data
print(tx_feature_df.head(2)) ## other features
print(tx_cluster.shape,tx_feature_df.shape)
     user_id  Recency  RecencyCluster  Frequency  FrequencyCluster  Revenue  \
0   user_898        6               2        242                 1  6274.41   
1  user_4896        3               2        153                 1  4066.90   

   RevenueCluster  OverallScore    segment  m5_Revenue  LTVCluster  
0               0             3  Low-Value      232.65           0  
1               0             3  Low-Value     1277.49           0  
  user_id  birth_year country    city               created_date  \
0  user_0        1989      PL  Gdansk 2018-01-13 05:15:15.599466   
1  user_1        1975      GB  London 2018-01-29 03:38:46.676876   

   user_settings_crypto_unlocked      plan  \
0                              1  STANDARD   
1                              0  STANDARD   

   attributes_notifications_marketing_push  \
0                                      1.0   
1                                      NaN   

   attributes_notifications_marketing_email  num_contacts ...   \
0                                       1.0             3 ...    
1                                       NaN            21 ...    

   count_of transaction_id  CANCELLED COMPLETED  DECLINED  FAILED_y  PENDING  \
0                    254.0        0.0     144.0      15.0       3.0      0.0   
1                    113.0        1.0      67.0       5.0       3.0      0.0   

   REVERTED  after_notification_trans_in_7_days  days_since_app  age  
0       2.0                                28.0      488.546544   31  
1       8.0                                32.0      472.613545   45  

[2 rows x 46 columns]
(17646, 11) (19430, 46)
In [272]:
## merge everything 
final_df = tx_feature_df.merge(tx_cluster,how='inner',on='user_id')
final_df.shape
Out[272]:
(17646, 56)
In [273]:
final_df.head(2)
Out[273]:
user_id birth_year country city created_date user_settings_crypto_unlocked plan attributes_notifications_marketing_push attributes_notifications_marketing_email num_contacts ... Recency RecencyCluster Frequency FrequencyCluster Revenue RevenueCluster OverallScore segment m5_Revenue LTVCluster
0 user_0 1989 PL Gdansk 2018-01-13 05:15:15.599466 1 STANDARD 1.0 1.0 3 ... 8 2 254 1 7940.15 0 3 Low-Value 5743.62 0
1 user_1 1975 GB London 2018-01-29 03:38:46.676876 0 STANDARD NaN NaN 21 ... 7 2 113 0 5053.01 0 2 Low-Value 1050.87 0

2 rows × 56 columns

In [274]:
final_df['failed_notifc'] = final_df['FAILED_x']+final_df['FAILED_y']
In [275]:
final_df.columns
Out[275]:
Index(['user_id', 'birth_year', 'country', 'city', 'created_date',
       'user_settings_crypto_unlocked', 'plan',
       'attributes_notifications_marketing_push',
       'attributes_notifications_marketing_email', 'num_contacts',
       'num_referrals', 'num_successful_referrals', 'brand',
       'count_of notification_sent', 'BLACK_FRIDAY',
       'ENGAGEMENT_SPLIT_BILL_RESTAURANT', 'FIFTH_PAYMENT_PROMO',
       'JOINING_ANNIVERSARY', 'LOST_CARD_ORDER',
       'MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_GAME_START',
       'METAL_RESERVE_PLAN', 'NO_INITIAL_CARD_ORDER', 'NO_INITIAL_CARD_USE',
       'NO_INITIAL_FREE_PROMOPAGE_CARD_ORDER',
       'ONBOARDING_TIPS_ACTIVATED_USERS', 'PREMIUM_ENGAGEMENT_FEES_SAVED',
       'PREMIUM_ENGAGEMENT_INACTIVE_CARD', 'PUMPKIN_PAYMENT_NOTIFICATION',
       'REENGAGEMENT_ACTIVE_FUNDS', 'WELCOME_HOME', 'EMAIL', 'PUSH', 'SMS',
       'FAILED_x', 'SENT', 'count_of transaction_id', 'CANCELLED', 'COMPLETED',
       'DECLINED', 'FAILED_y', 'PENDING', 'REVERTED',
       'after_notification_trans_in_7_days', 'days_since_app', 'age',
       'Recency', 'RecencyCluster', 'Frequency', 'FrequencyCluster', 'Revenue',
       'RevenueCluster', 'OverallScore', 'segment', 'm5_Revenue', 'LTVCluster',
       'failed_notifc'],
      dtype='object')
In [279]:
final_df['LTVCluster'].value_counts()
Out[279]:
0    16665
1      981
Name: LTVCluster, dtype: int64
In [276]:
FEATURES = ['user_id','user_settings_crypto_unlocked','plan','attributes_notifications_marketing_push',
           'attributes_notifications_marketing_email','num_contacts','num_referrals', 'num_successful_referrals','brand',
           'count_of notification_sent','BLACK_FRIDAY','ENGAGEMENT_SPLIT_BILL_RESTAURANT', 'FIFTH_PAYMENT_PROMO',
           'JOINING_ANNIVERSARY', 'LOST_CARD_ORDER','MADE_MONEY_REQUEST_NOT_SPLIT_BILL', 'METAL_GAME_START',
       'METAL_RESERVE_PLAN', 'NO_INITIAL_CARD_ORDER', 'NO_INITIAL_CARD_USE',
       'NO_INITIAL_FREE_PROMOPAGE_CARD_ORDER','ONBOARDING_TIPS_ACTIVATED_USERS', 'PREMIUM_ENGAGEMENT_FEES_SAVED',
       'PREMIUM_ENGAGEMENT_INACTIVE_CARD', 'PUMPKIN_PAYMENT_NOTIFICATION',
       'REENGAGEMENT_ACTIVE_FUNDS', 'WELCOME_HOME', 'EMAIL', 'PUSH', 'SMS','PENDING', 'REVERTED',
       'SENT', 'count_of transaction_id', 'CANCELLED', 'COMPLETED','failed_notifc','after_notification_trans_in_7_days',
        'days_since_app', 'age','RecencyCluster','FrequencyCluster','RevenueCluster','LTVCluster']

       
In [280]:
## X,y 
df = final_df[FEATURES]
X  = df.drop(['user_id','LTVCluster'],axis=1)
y = df['LTVCluster']
In [281]:
y.value_counts()
Out[281]:
0    16665
1      981
Name: LTVCluster, dtype: int64
In [282]:
(981/16665)*100
Out[282]:
5.886588658865886
In [283]:
X.head(2)
Out[283]:
user_settings_crypto_unlocked plan attributes_notifications_marketing_push attributes_notifications_marketing_email num_contacts num_referrals num_successful_referrals brand count_of notification_sent BLACK_FRIDAY ... count_of transaction_id CANCELLED COMPLETED failed_notifc after_notification_trans_in_7_days days_since_app age RecencyCluster FrequencyCluster RevenueCluster
0 1 STANDARD 1.0 1.0 3 0 0 Apple 7.0 0.0 ... 254.0 0.0 144.0 4.0 28.0 488.546544 31 2 1 0
1 0 STANDARD NaN NaN 21 0 0 Apple 5.0 0.0 ... 113.0 1.0 67.0 5.0 32.0 472.613545 45 2 0 0

2 rows × 42 columns

In [284]:
X.describe()
Out[284]:
user_settings_crypto_unlocked attributes_notifications_marketing_push attributes_notifications_marketing_email num_contacts num_referrals num_successful_referrals count_of notification_sent BLACK_FRIDAY ENGAGEMENT_SPLIT_BILL_RESTAURANT FIFTH_PAYMENT_PROMO ... count_of transaction_id CANCELLED COMPLETED failed_notifc after_notification_trans_in_7_days days_since_app age RecencyCluster FrequencyCluster RevenueCluster
count 17646.000000 11349.000000 11349.000000 17646.000000 17646.0 17646.0 17045.000000 17045.000000 17045.000000 17045.000000 ... 17646.000000 17646.000000 17646.000000 17045.000000 17268.000000 17646.000000 17646.000000 17646.000000 17646.000000 17646.000000
mean 0.187408 0.944577 0.894881 17.883940 0.0 0.0 5.125139 0.077970 0.061074 0.812614 ... 76.449167 0.031622 42.470475 1.859196 19.934503 303.458400 36.460388 1.680778 0.160660 0.000397
std 0.390250 0.228815 0.306721 46.889434 0.0 0.0 3.901029 0.277593 0.250255 1.277858 ... 137.254592 0.186595 76.961851 2.139004 73.253916 102.807481 11.368238 0.602292 0.416814 0.024965
min 0.000000 0.000000 0.000000 0.000000 0.0 0.0 1.000000 0.000000 0.000000 0.000000 ... 1.000000 0.000000 0.000000 0.000000 0.000000 137.796377 19.000000 0.000000 0.000000 0.000000
25% 0.000000 1.000000 1.000000 0.000000 0.0 0.0 3.000000 0.000000 0.000000 0.000000 ... 6.000000 0.000000 4.000000 0.000000 0.000000 214.988567 28.000000 2.000000 0.000000 0.000000
50% 0.000000 1.000000 1.000000 5.000000 0.0 0.0 5.000000 0.000000 0.000000 1.000000 ... 27.000000 0.000000 15.000000 1.000000 6.000000 296.862501 34.000000 2.000000 0.000000 0.000000
75% 0.000000 1.000000 1.000000 19.000000 0.0 0.0 7.000000 0.000000 0.000000 1.000000 ... 84.000000 0.000000 47.000000 3.000000 20.000000 387.613036 43.000000 2.000000 0.000000 0.000000
max 1.000000 1.000000 1.000000 2918.000000 0.0 0.0 230.000000 3.000000 3.000000 61.000000 ... 2254.000000 3.000000 1335.000000 41.000000 7325.000000 500.402687 91.000000 2.000000 2.000000 2.000000

8 rows × 40 columns

Data cleaning and make it ready for Modeling

In [285]:
## grab categorical data
categorical_features = X.select_dtypes(include=['object'])
categorical_features = categorical_features.columns
## Encode the object data 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
categorical_features
Out[285]:
Index(['plan', 'brand'], dtype='object')
In [286]:
# Categorical boolean mask
categorical_feature_mask = X.dtypes==object
# filter categorical columns using mask and turn it into a list
categorical_cols = X.columns[categorical_feature_mask].tolist()
categorical_cols
Out[286]:
['plan', 'brand']
In [287]:
from sklearn.preprocessing import LabelEncoder
# instantiate labelencoder object
le = LabelEncoder()
# apply le on categorical feature columns
X[categorical_cols] = X[categorical_cols].apply(lambda col: le.fit_transform(col))
X[categorical_cols].head(2)
Out[287]:
plan brand
0 5 1
1 5 1
In [288]:
## fill the missing values with 0 (very naive way )
X.fillna(0,inplace=True)
In [289]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print('X_train & X_test shape:',X_train.shape,X_test.shape)
print('y_train & y_test distribution shape:',y_train.value_counts(),y_test.value_counts())
X_train & X_test shape: (14116, 42) (3530, 42)
y_train & y_test distribution shape: 0    13347
1      769
Name: LTVCluster, dtype: int64 0    3318
1     212
Name: LTVCluster, dtype: int64

Data Approach (random Over sampling I did not try SMOTE beacuase I have some incompatibility in py)

In [290]:
df_x = X_train
df_x['LTVCluster'] = y_train

# Class count
count_class_0, count_class_1 = df_x.LTVCluster.value_counts()

# Divide by class
df_class_0 = df_x[df_x['LTVCluster'] == 0]
df_class_1 = df_x[df_x['LTVCluster'] == 1]
df_class_0_under = df_class_0.sample(count_class_1)
df_test_under = pd.concat([df_class_0_under, df_class_1], axis=0)

df_class_1_over = df_class_1.sample(count_class_0, replace=True)
df_test_over = pd.concat([df_class_0, df_class_1_over], axis=0)

print('Random over-sampling:')
print(df_test_over.LTVCluster.value_counts())

df_test_over.LTVCluster.value_counts().plot(kind='bar', title='Count (target)');

X_train_smp = df_test_over.drop(['LTVCluster'],axis=1)
y_train_smp = df_test_over['LTVCluster']
print('After over sampling new training data:',X_train_smp.shape,y_train_smp.value_counts())
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

Random over-sampling:
1    13347
0    13347
Name: LTVCluster, dtype: int64
After over sampling new training data: (26694, 42) 1    13347
0    13347
Name: LTVCluster, dtype: int64

Feature Selection

Method-1:remove the correlated values
In [297]:
# find and remove correlated features
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr
 
corr_features = correlation(X_train_smp, 0.8)
print('correlated features: ', len(set(corr_features)) )

# removed correlated  features
X_train_smp.drop(labels=corr_features, axis=1, inplace=True)
X_test_smp.drop(labels=corr_features, axis=1, inplace=True)
# keep a copy of the dataset at  this stage
X_train_corr = X_train_smp.copy()
X_test_corr = X_test_smp.copy() 
correlated features:  11
Method-2:ROC-AUC based features
In [298]:
# find important features using univariate roc-auc
 
# loop to build a tree, make predictions and get the roc-auc
# for each feature of the train set
 
roc_values = []
for feature in X_train_corr.columns:
    clf = DecisionTreeClassifier()
    clf.fit(X_train_smp[feature].fillna(0).to_frame(), y_train_smp)
    y_scored = clf.predict_proba(X_test_smp[feature].fillna(0).to_frame())
    roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))

# let's add the variable names and order it for clearer visualisation
roc_values = pd.Series(roc_values)
roc_values.index = X_train_smp.columns
roc_values.sort_values(ascending=False).plot.bar(figsize=(20, 8))
Out[298]:
<matplotlib.axes._subplots.AxesSubplot at 0x20701f27a90>
Method-3:Random Forest Based Feature
In [303]:
# select features using the impotance derived from
# random forests
 
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=400))
sel_.fit(X_train_smp, y_train_smp)
 
# remove features with zero coefficient from dataset
# and parse again as dataframe (output of sklearn is
# numpy array)
X_train_rf = pd.DataFrame(sel_.transform(X_train_smp))
X_test_rf = pd.DataFrame(sel_.transform(X_test_smp))
 
# add the columns name
X_train_rf.columns = X_train_smp.columns[(sel_.get_support())]
X_test_rf.columns = X_train_smp.columns[(sel_.get_support())]
print(X_train_rf.shape,X_test_rf.shape)
(26694, 8) (3530, 8)

Model_building

In [314]:
y_test.value_counts()
Out[314]:
0    3318
1     212
Name: LTVCluster, dtype: int64
In [ ]:
def model_builder(algo,X_train,y_train,X_test):
    algo.fit(X_train,y_train)
    y_pred = algo.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test,y_pred))
    print('==ROC-AUC==:',roc_auc_score(y_test,y_pred))
    return y_pred
In [308]:
# Scale for Logistic Regression 
scaler = MinMaxScaler()
X_train_mt = scaler.fit_transform(X_train_smp)
X_test_mt = scaler.fit_transform(X_test_smp)
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\sklearn\preprocessing\data.py:334: DataConversionWarning:

Data with input dtype int32, int64, float64 were all converted to float64 by MinMaxScaler.

C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\sklearn\preprocessing\data.py:334: DataConversionWarning:

Data with input dtype int32, int64, float64 were all converted to float64 by MinMaxScaler.

In [312]:
## all set of features we want Logistic (origial set of features)
algo = LogisticRegression(penalty='l1') 
model_builder(algo,X_train_smp,y_train_smp,X_test)
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning:

Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.

              precision    recall  f1-score   support

           0       0.97      0.82      0.89      3318
           1       0.18      0.62      0.28       212

   micro avg       0.81      0.81      0.81      3530
   macro avg       0.57      0.72      0.58      3530
weighted avg       0.92      0.81      0.85      3530

[[2711  607]
 [  81  131]]
==ROC-AUC==: 0.717491498629545
Out[312]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
In [316]:
## without corr
model_builder(algo,X_train_corr,y_train_smp,X_test_corr)
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning:

Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.

              precision    recall  f1-score   support

           0       0.97      0.82      0.89      3318
           1       0.18      0.62      0.28       212

   micro avg       0.81      0.81      0.81      3530
   macro avg       0.57      0.72      0.58      3530
weighted avg       0.92      0.81      0.85      3530

[[2712  606]
 [  81  131]]
==ROC-AUC==: 0.7176421918182129
Out[316]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
In [317]:
## rf 
model_builder(algo,X_train_rf,y_train_smp,X_test_rf)
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning:

Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.

              precision    recall  f1-score   support

           0       0.97      0.84      0.90      3318
           1       0.19      0.60      0.29       212

   micro avg       0.82      0.82      0.82      3530
   macro avg       0.58      0.72      0.59      3530
weighted avg       0.92      0.82      0.86      3530

[[2774  544]
 [  84  128]]
==ROC-AUC==: 0.7199096978175077
Out[317]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

Tree Based Algo

In [321]:
algo=xgb.XGBClassifier(max_depth=5, learning_rate=0.1,n_jobs=-1)

_ = model_builder(algo,X_train_rf,y_train_smp,X_test_rf)
              precision    recall  f1-score   support

           0       0.97      0.83      0.89      3318
           1       0.17      0.55      0.26       212

   micro avg       0.81      0.81      0.81      3530
   macro avg       0.57      0.69      0.58      3530
weighted avg       0.92      0.81      0.85      3530

[[2748  570]
 [  96  116]]
==ROC-AUC==: 0.6876897881196903
In [323]:
algo = RandomForestClassifier(max_depth=3)
model_builder(algo,X_train_rf,y_train_smp,X_test_rf)
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\sklearn\ensemble\forest.py:246: FutureWarning:

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.

              precision    recall  f1-score   support

           0       0.97      0.79      0.87      3318
           1       0.17      0.66      0.27       212

   micro avg       0.78      0.78      0.78      3530
   macro avg       0.57      0.72      0.57      3530
weighted avg       0.92      0.78      0.84      3530

[[2621  697]
 [  73  139]]
==ROC-AUC==: 0.7227970361777385
Out[323]:
array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

Algorithmic based weighted

In [324]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
print('X_train & X_test shape:',X_train.shape,X_test.shape)
print('y_train & y_test distribution shape:',y_train.value_counts(),y_test.value_counts())
X_train & X_test shape: (14116, 42) (3530, 42)
y_train & y_test distribution shape: 0    13347
1      769
Name: LTVCluster, dtype: int64 0    3318
1     212
Name: LTVCluster, dtype: int64
Method-1: Rf based features
In [325]:
# select features using the impotance derived from
# random forests
 
sel_ = SelectFromModel(RandomForestClassifier(n_estimators=400))
sel_.fit(X_train, y_train)
 
# remove features with zero coefficient from dataset
# and parse again as dataframe (output of sklearn is
# numpy array)
X_train_rf = pd.DataFrame(sel_.transform(X_train))
X_test_rf = pd.DataFrame(sel_.transform(X_test))
 
# add the columns name
X_train_rf.columns = X_train.columns[(sel_.get_support())]
X_test_rf.columns = X_train.columns[(sel_.get_support())]
print(X_train_rf.shape,X_test_rf.shape)
(14116, 14) (3530, 14)
In [326]:
# find important features using univariate roc-auc
 
# loop to build a tree, make predictions and get the roc-auc
# for each feature of the train set
 
roc_values = []
for feature in X_train.columns:
    clf = DecisionTreeClassifier()
    clf.fit(X_train[feature].fillna(0).to_frame(), y_train)
    y_scored = clf.predict_proba(X_test[feature].fillna(0).to_frame())
    roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))

# let's add the variable names and order it for clearer visualisation
roc_values = pd.Series(roc_values)
roc_values.index = X_train.columns
roc_values.sort_values(ascending=False).plot.bar(figsize=(20, 8))
Out[326]:
<matplotlib.axes._subplots.AxesSubplot at 0x2070352dc50>

Impute class_weight hyperparameter & scale_pos_weights for RF & Xgboost Models

In [328]:
y_train.value_counts()
Out[328]:
0    13347
1      769
Name: LTVCluster, dtype: int64
In [329]:
scale_pos_weight = 13347/769
scale_pos_weight
Out[329]:
17.35630689206762
In [330]:
from sklearn.utils import class_weight
class_weights = class_weight.compute_class_weight('balanced',
                                                 np.unique(y_train),
                                                 y_train)
class_weights
Out[330]:
array([0.52880797, 9.17815345])
In [331]:
## rf 
algo = RandomForestClassifier(max_depth=3,class_weight={0:0.52880797,1:9.17815345})
model_builder(algo,X_train_rf,y_train,X_test_rf)
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\sklearn\ensemble\forest.py:246: FutureWarning:

The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.

              precision    recall  f1-score   support

           0       0.98      0.79      0.87      3318
           1       0.17      0.70      0.28       212

   micro avg       0.78      0.78      0.78      3530
   macro avg       0.58      0.75      0.58      3530
weighted avg       0.93      0.78      0.84      3530

[[2615  703]
 [  63  149]]
==ROC-AUC==: 0.7454777827061084
Out[331]:
array([0, 0, 0, ..., 0, 1, 0], dtype=int64)
In [332]:
algo = LogisticRegression(penalty='l1',class_weight={0:0.52880797,1:9.17815345})
model_builder(algo,X_train_rf,y_train,X_test_rf)
C:\Users\mainak.kundu\AppData\Local\Continuum\anaconda2\envs\lstm_test_env\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning:

Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.

              precision    recall  f1-score   support

           0       0.98      0.86      0.91      3318
           1       0.23      0.66      0.34       212

   micro avg       0.84      0.84      0.84      3530
   macro avg       0.60      0.76      0.63      3530
weighted avg       0.93      0.84      0.88      3530

[[2842  476]
 [  72  140]]
==ROC-AUC==: 0.7584587214393759
Out[332]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
In [342]:
### Predict test_y values and probabilities based on fitted logistic 
#regression model

pred_y=algo.predict(X_test_rf) 

probs_y=algo.predict_proba(X_test_rf)


from sklearn.metrics import accuracy_score
precision, recall, thresholds = precision_recall_curve(y_test, probs_y[:, 
1]) 
   #retrieve probability of being 1(in second column of probs_y)
#pr_auc = metrics.auc(recall, precision)

plt.title("Precision-Recall vs Threshold Chart")
plt.plot(thresholds, precision[: -1], "b--", label="Precision")
plt.plot(thresholds, recall[: -1], "r--", label="Recall")
plt.ylabel("Precision, Recall")
plt.xlabel("Threshold")
plt.legend(loc="lower left")
plt.ylim([0,1])
Out[342]:
(0, 1)

A/B testing

Hypothesis is:

1.Group A will exposed to offer and on High Retention
2.Group B will not exposed to offer and Low Retention 
This also helps us to test model accuracy as well. If group B’s retention rate is 50%, it clearly shows that our model is    not working.
Sucess_Metric = Retention rate 
In [343]:
df_hv = pd.DataFrame()
df_hv['customer_id'] = np.array([count for count in range(20000)])
df_hv['segment'] = np.array(['high-value' for _ in range(20000)])
df_hv['group'] = 'control'
df_hv.loc[df_hv.index<10000,'group'] = 'test'
In [347]:
df_hv.head(2)
Out[347]:
customer_id segment group purchase_count
0 0 high-value test 0.0
1 1 high-value test 1.0
Ideally, purchase count should be a Poisson distribution. There will be customers with no purchase and we will have less customers with high purchase counts. Let’s use numpy.random.poisson() for doing that and assign different distributions to test and control group
In [345]:
df_hv.loc[df_hv.group == 'test', 'purchase_count'] = np.random.poisson(0.6, 10000)
df_hv.loc[df_hv.group == 'control', 'purchase_count'] = np.random.poisson(0.5, 10000)
In [348]:
test_results = df_hv[df_hv.group == 'test'].purchase_count
control_results = df_hv[df_hv.group == 'control'].purchase_count

hist_data = [test_results, control_results]

group_labels = ['test', 'control']

Assume we applied an offer to 50% of high-value users and observed their purchases in a given period. Best way to visualize it to check the densities:

H0:No signifiant difference between 2 groups

H1:Significant difference between the groups

In [351]:
from scipy import stats 
def eval_test(test_results,control_results):
    test_result = stats.ttest_ind(test_results, control_results)
    if test_result[1] < 0.05:
        print('result is significant')
    else:
        print('result is not significant')
In [352]:
eval_test(test_results,control_results)
result is significant

If you select a biased test group, your results will be statistically significant by default. As an example, if we allocate more high-value customer to test group and more low-value customers to control group, then our experiment becomes a failure from the beginning. That’s why selecting the group is the key to a healthy A/B test.